# Fetch QQ numbers posted in a Baidu Tieba thread

import re
import ssl
import urllib.request
import os


def writeFile1Bytes(htmlBytes, toPath):
    """Persist the raw *htmlBytes* to the file at *toPath* (binary mode)."""
    with open(toPath, "wb") as out:
        out.write(htmlBytes)


def writeFile1Str(htmlBytes, toPath):
    """Persist page content to the file at *toPath*.

    Originally this was a byte-identical copy of writeFile1Bytes despite the
    "Str" name.  It now also accepts a ``str``, which is UTF-8 encoded before
    writing; ``bytes`` input is written unchanged, so existing callers keep
    working.
    """
    # Encode str input so the name finally matches the behavior; bytes pass
    # through untouched (backward compatible).
    if isinstance(htmlBytes, str):
        htmlBytes = htmlBytes.encode("utf-8")
    with open(toPath, "wb") as f:
        f.write(htmlBytes)


def getHtmlBytes(url):
    """Download *url* and return the raw response body as bytes."""
    # Present a desktop-browser User-Agent so the server serves the normal page.
    request = urllib.request.Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"
        },
    )
    # Deliberately skip TLS certificate verification (target site may have a
    # broken/old certificate) — NOTE(review): unsafe for untrusted endpoints.
    unverified = ssl._create_unverified_context()
    response = urllib.request.urlopen(request, context=unverified)
    return response.read()


def qqCrawler(url, toPath):
    """Fetch *url*, extract all distinct QQ-number-like strings, and save
    them one per line to *toPath*.

    A candidate QQ number is 5-10 digits with a non-zero leading digit.
    """
    htmlBytes = getHtmlBytes(url)
    # Bug fix: str(htmlBytes) produced the "b'...'" repr, leaving \xNN escape
    # sequences in the searched text.  Decode instead, ignoring bytes that are
    # not valid UTF-8 (digits are unaffected either way).
    htmlStr = htmlBytes.decode("utf-8", errors="ignore")

    # QQ numbers: 5 to 10 digits, first digit 1-9.
    pat = r"[1-9]\d{4,9}"
    re_qq = re.compile(pat)
    qqsList = re_qq.findall(htmlStr)

    # De-duplicate; sort so the output order is deterministic run to run
    # (list(set(...)) ordering depends on hash randomization).
    qqsList = sorted(set(qqsList))
    print(qqsList)
    saveData(qqsList, toPath)


def saveData(data, path):
    """Write each item of *data* on its own line to the text file *path*.

    Fixes vs. the original:
    - uses a context manager so the file is closed even if a write fails;
    - writes "\n" instead of os.linesep: text mode already translates "\n"
      to the platform line ending, so os.linesep yielded "\r\r\n" on Windows;
    - pins UTF-8 instead of relying on the locale default encoding.
    """
    with open(path, "w", encoding="utf-8") as f:
        f.writelines("%s\n" % item for item in data)


# Script configuration: thread to crawl and where to dump the QQ numbers.
url = "http://tieba.baidu.com/p/5471533241?traceid="
toPath = '/Users/fwh/A_FWH/SourceTree/spider/temp/qqInfo.txt'

# Guard the crawl so importing this module for its helpers does not trigger
# a network request; behavior when run as a script is unchanged.
if __name__ == "__main__":
    qqCrawler(url, toPath)
